################################
## Script: 10_analysis.R
## Purpose: This code runs the case study analysis.  
## Data In:
## 1) Fine tuned estimator annotations
## data/gpt_annotations_finetune_full.rds
## 2) Full articles
## data/bioweapons_casestudy_5_20_2024_sbert_embeddings.json
## 3) ngram output
## data/ngrams_edges.rds
## Data Out:
## 1) figures/percent_matched_ngram_comparison.pdf
## 2) figures/percent_matched.pdf
## 3) figures/overtime_bioweapons_counts.pdf
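## Notes:
## The code shown here contains no ggsave()/pdf() call; the files listed
## under "Data Out" are presumably exported by hand from the plot device
## at the sizes noted in the comment after each plot.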


library(igraph) ## 2.0.3
library(tidyverse) ## 2.0.0
library(lubridate) ## 1.9.4

## Pairwise annotations from the fine tuned estimator;
## only rows where gpt4o_finetune is "YES" are kept.
matches <- readRDS("data/gpt_annotations_finetune_full.rds") %>%
  filter(gpt4o_finetune == "YES")

## Total articles in the case study.
total_art <- jsonlite::stream_in(
  file("data/bioweapons_casestudy_5_20_2024_sbert_embeddings.json")
)

## Ngram edges (used for comparison against the gpt4o annotations).
ngrams <- readRDS("data/ngrams_edges.rds")


####################################
### Create Variables ###############
####################################

## Language of each article, derived from the name of its source.
## A source that appears in neither list below is labelled "ukranian".
## NOTE(review): "sputnik_cn" is listed as russian state media where
## source_type is built, but it is in neither list here, so any such
## article would be labelled "ukranian" -- confirm this is intended.
english_sources <- c(
  "100_percent_fed_up", "abc_news", "bipartisan_report",
  "business_insider", "bykvu_eng", "cbs_news", "censor_net_en",
  "clash_daily", "cnbc", "cnn", "crooks_and_liars", "daily_caller",
  "fakty_com_en", "fox_news", "gordonua_en", "huffpost", "ijr",
  "infowars", "interfax_ua_en", "judicial_watch", "latimes", "lb_ua_en",
  "msnbc", "natural_news", "nbc_news", "new_york_post", "new_york_times",
  "npr", "occupy_democrats", "palmer_report", "pbs", "politico",
  "pravda", "pravda_com_en", "rt", "slate", "sputnik", "star_tribune",
  "stillness_in_the_storm", "tass", "the_federalist_papers",
  "the_gateway_pundit", "the_hill", "the_mind_unleashed",
  "the_political_insider", "tsn_en", "ukrinform_en", "usa_today",
  "wall_street_journal", "washington_post", "yahoo_news", "zerohedge"
)

russian_sources <- c(
  "24tk_ru", "bykvu_ru", "censor_net_ru", "fakty_com_ru", "fakty_ru",
  "gordonua_ru", "nv_ru", "pravda_com_ru", "pravda_ru", "rbc_ru",
  "rt_ru", "tass_ru", "tsn_ru", "ukrinform_ru", "unian_ru", "zn_ru"
)

total_art$language <- ifelse(
  total_art$source_name %in% english_sources,
  "english",
  ifelse(total_art$source_name %in% russian_sources,
         "russian",
         "ukranian")
)


## Source type of each article, derived from the name of its source.
## The three lists are checked in order; a source that appears in none
## of them is labelled "ukranian".
low_quality_sources <- c(
  "100_percent_fed_up", "bipartisan_report", "clash_daily",
  "crooks_and_liars", "daily_caller", "ijr", "infowars",
  "judicial_watch", "natural_news", "occupy_democrats", "palmer_report",
  "stillness_in_the_storm", "the_federalist_papers",
  "the_gateway_pundit", "the_mind_unleashed", "the_political_insider",
  "zerohedge"
)

popular_us_sources <- c(
  "abc_news", "business_insider", "cbs_news", "cnbc", "cnn", "fox_news",
  "huffpost", "latimes", "msnbc", "nbc_news", "new_york_post",
  "new_york_times", "npr", "pbs", "politico", "slate", "star_tribune",
  "the_hill", "usa_today", "wall_street_journal", "washington_post",
  "yahoo_news"
)

russian_state_sources <- c(
  "pravda", "pravda_ru", "rt", "rt_ru",
  "sputnik", "sputnik_cn", "tass", "tass_ru"
)

total_art$source_type <- ifelse(
  total_art$source_name %in% low_quality_sources,
  "low quality",
  ifelse(
    total_art$source_name %in% popular_us_sources,
    "most popular US",
    ifelse(total_art$source_name %in% russian_state_sources,
           "russian state media",
           "ukranian")
  )
)






## Look up the source type of a vector of article ids in total_art.
## Ids that are not found in total_art$article_id come back as NA.
source_type_of <- function(article_ids) {
  total_art$source_type[match(article_ids, total_art$article_id)]
}

## Source type of both articles in every gpt4o-matched pair ...
matches$ego_source_type <- source_type_of(matches$ego_id)
matches$alter_source_type <- source_type_of(matches$alter_id)

## ... and in every ngram pair.
ngrams$ego_source_type <- source_type_of(ngrams$ego_id)
ngrams$alter_source_type <- source_type_of(ngrams$alter_id)


## Create variables:
## whether or not each article was matched to a russian state media /
## ukranian source, by the fine tuned gpt4o annotator or by the ngram
## measure.
## For the ngram measure we use a cutoff of .2 because we found .2 was
## the best performing estimator (see Table 1 in main results);
## .2 was the minimum to be included in the ngram_pairs.
## Note that for both measures this will include russian articles
## matched to other russian articles, but we do the analysis by source
## type below.

## "yes" when the article id sits on either side of a pair whose other
## side has source type `partner_type`, otherwise "no".
matched_to_type <- function(edges, partner_type) {
  ids_as_ego <- edges$ego_id[edges$alter_source_type == partner_type]
  ids_as_alter <- edges$alter_id[edges$ego_source_type == partner_type]
  ifelse(
    total_art$article_id %in% ids_as_ego |
      total_art$article_id %in% ids_as_alter,
    "yes", "no"
  )
}

total_art$matched_russian <-
  matched_to_type(matches, "russian state media")
total_art$matched_russian_ngram <-
  matched_to_type(ngrams, "russian state media")

total_art$matched_ukranian <-
  matched_to_type(matches, "ukranian")
total_art$matched_ukranian_ngram <-
  matched_to_type(ngrams, "ukranian")


## Whether each article is matched to a US article.
## An article counts as matched when its id sits on one side of a pair
## whose other side (a) has source type "most popular US" or
## "low quality" and (b) comes from a source other than the article's own
## source_name. The same rule is applied to the gpt4o pairs (matched_US)
## and to the ngram pairs (matched_US_ngram).
##
## edges:        pair table with ego_id, alter_id, ego_source,
##               alter_source, ego_source_type and alter_source_type
## article_ids:  ids of the articles to flag
## source_names: source of each article, parallel to article_ids
## Returns a character vector of "yes"/"no", one entry per article.
flag_us_match <- function(edges, article_ids, source_names) {
  us_types <- c("most popular US", "low quality")

  ## The US-type filter does not depend on the article, so apply it once
  ## instead of on every iteration. %in% treats a missing source type as
  ## "not US" rather than producing an NA subscript.
  alter_is_us <- edges$alter_source_type %in% us_types
  ego_is_us <- edges$ego_source_type %in% us_types

  ids_with_us_alter <- edges$ego_id[alter_is_us]
  us_alter_sources <- edges$alter_source[alter_is_us]
  ids_with_us_ego <- edges$alter_id[ego_is_us]
  us_ego_sources <- edges$ego_source[ego_is_us]

  ## seq_along() keeps this safe when there are no articles at all
  ## (1:nrow() would iterate over c(1, 0)).
  is_matched <- vapply(seq_along(article_ids), function(i) {
    ## which() drops NA comparisons, so a pair with an unknown partner
    ## source cannot add an NA id to the candidate set.
    from_other_source_alter <- which(us_alter_sources != source_names[i])
    from_other_source_ego <- which(us_ego_sources != source_names[i])
    article_ids[i] %in% c(ids_with_us_alter[from_other_source_alter],
                          ids_with_us_ego[from_other_source_ego])
  }, logical(1))

  ifelse(is_matched, "yes", "no")
}

total_art$matched_US <- flag_us_match(matches,
                                      total_art$article_id,
                                      total_art$source_name)
total_art$matched_US_ngram <- flag_us_match(ngrams,
                                            total_art$article_id,
                                            total_art$source_name)


## Create combined match type.
## The key is <matched_russian>_<matched_ukranian>_<matched_US>, e.g.
## "yes_no_yes"; each key is then replaced by a readable label.
match_labels <- c(
  no_no_no = "Non-Matched",
  no_no_yes = "Matched to US Source",
  no_yes_no = "Matched to Ukranian Source",
  yes_no_no = "Matched to Russian Source",
  yes_no_yes = "Matched to US and Russian Source",
  yes_yes_no = "Matched to Ukranian and Russian Source",
  yes_yes_yes = "Matched to Ukranian, Russian, and US Source",
  no_yes_yes = "Matched to Ukranian and US Source"
)

match_key <- paste(total_art$matched_russian,
                   total_art$matched_ukranian,
                   total_art$matched_US,
                   sep = "_")
total_art$match_final <- dplyr::recode(match_key, !!!match_labels)

## Collapsed version: every label that names more than one source type
## becomes "Multiple Source Types"; the other labels are left unchanged.
multi_source_labels <- c(
  "Matched to US and Russian Source",
  "Matched to Ukranian and Russian Source",
  "Matched to Ukranian, Russian, and US Source",
  "Matched to Ukranian and US Source"
)
total_art$match_final_c <- replace(
  total_art$match_final,
  total_art$match_final %in% multi_source_labels,
  "Multiple Source Types"
)


###########################################
### Plots ##################################
###########################################

## Estimates referenced in text, results section.

## Number of articles per source type.
table(total_art$source_type)

## Articles per source type, split by whether they were matched to
## russian state media.
russian_by_type <- table(total_art$source_type,
                         total_art$matched_russian)
russian_by_type

## Row percentages computed from the data, so the shares quoted in the
## text can be re-derived instead of relying on hand-typed counts.
round(100 * prop.table(russian_by_type, margin = 1), 1)

## Hand-typed counts, presumably copied from the cross-tab above when the
## paper was written; kept as a record of the quoted numbers.
52 / (317 + 52) ## 14.1%
37 / (646 + 37) ## 5.4%

## 1. Count of articles over time
## Included in main text, Figure 1

## final_date is converted to Date so it can be used as a date axis.
total_art$final_date <- as.Date(total_art$final_date)

## One row per date with the number of articles on that date.
plot <- total_art %>%
  group_by(final_date) %>%
  summarise(n = n(), .groups = "drop")

ggplot(plot, aes(x = final_date, y = n)) +
  geom_line() +
  ## text call-outs placed by hand at fixed date / count positions
  annotate(
    "text",
    x = as.Date("2022-2-20"),
    y = 220,
    label = "Russia calls for U.N. Security\n Council meeting on bioweapons"
  ) +
  annotate(
    "text",
    x = as.Date("2022-4-12"),
    y = 200,
    label = "Russia accuses Hunter Biden\n of funding 'bioweapons program'"
  ) +
  labs(x = NULL,
       y = "Count of Articles") +
  theme_bw()
## figures/overtime_bioweapons_counts.pdf
## 5x9 landscape

## 2. Overall percent of stories shared with russian state media /
## ukranian media / another US source, by US source and source type
## (fine tuned gpt4o annotator).
## Main text, Figure 2
##
## One row per US source and match type; `percent` is the share of that
## source's articles with the given match flag set to "yes".
## pivot_longer() is used in place of the superseded gather(); it yields
## the same rows in a different order, which a boxplot does not depend on.
plot <- total_art %>%
  filter(source_type %in% c("low quality", "most popular US")) %>%
  group_by(source_name) %>%
  summarise(
    n = n(),
    source_type = unique(source_type),
    matched_russian_n = sum(matched_russian == "yes"),
    `Matched to\n Russian State Media` = matched_russian_n / n,
    matched_ukranian_n = sum(matched_ukranian == "yes"),
    ## the backticked names become the axis labels, so the spelling
    ## "Ukrainian" is used here
    `Matched to\n Ukrainian Media` = matched_ukranian_n / n,
    matched_us = sum(matched_US == "yes"),
    `Matched to US Source` = matched_us / n,
    .groups = "drop"
  ) %>%
  ## only the three share columns start with "Matched to"; the count
  ## columns (matched_*_n, matched_us) are carried along unchanged
  pivot_longer(cols = starts_with("Matched to"),
               names_to = "type",
               values_to = "percent") %>%
  mutate(source_type = dplyr::recode(source_type,
                                     `low quality` = "Low Quality US News",
                                     `most popular US` = "Popular US News"))

## The earlier filter on type != "Matched to Any Other Article" is gone:
## no row can carry that type, so it never removed anything.
ggplot(plot,
       mapping = aes(y = percent,
                     x = type)) +
  geom_boxplot(width = .4) +
  facet_wrap(~ source_type) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(y = "Percent of Articles by News Publication",
       x = NULL)
## figures/percent_matched.pdf
## 5x7


### 3.
## Overall percent of stories shared with russian state media /
## ukranian media / another US source, by source type and estimator
## (fine tuned gpt4o annotator vs. ngram).
## Included in main text, Figure 3
##
## One row per US source, match type and estimator; `percent` is the
## share of that source's articles with the given match flag set to "yes".
## pivot_longer() is used in place of the superseded gather(); it yields
## the same rows in a different order, which a boxplot does not depend on.
plot <- total_art %>%
  filter(source_type %in% c("low quality", "most popular US")) %>%
  group_by(source_name) %>%
  summarise(
    n = n(),
    source_type = unique(source_type),
    matched_russian_n = sum(matched_russian == "yes"),
    `Matched to\nRussian State Media` = matched_russian_n / n,
    `Matched to\nRussian State Media Ngram` =
      sum(matched_russian_ngram == "yes") / n,
    matched_ukranian_n = sum(matched_ukranian == "yes"),
    ## the backticked names become the facet labels, so the spelling
    ## "Ukrainian" is used here
    `Matched to\nUkrainian Media` = matched_ukranian_n / n,
    `Matched to\nUkrainian Media Ngram` =
      sum(matched_ukranian_ngram == "yes") / n,
    matched_us = sum(matched_US == "yes"),
    `Matched to\nUS Source` = matched_us / n,
    `Matched to\nUS Source Ngram` = sum(matched_US_ngram == "yes") / n,
    .groups = "drop"
  ) %>%
  ## only the six share columns start with "Matched to"; the count
  ## columns (matched_*_n, matched_us) are carried along unchanged
  pivot_longer(cols = starts_with("Matched to"),
               names_to = "type",
               values_to = "percent") %>%
  mutate(
    source_type = dplyr::recode(source_type,
                                `low quality` = "Low Quality US News",
                                `most popular US` = "Popular US News"),
    ## the estimator is read off the " Ngram" suffix of the column name
    ## before that suffix is stripped from `type`
    ## (label typo "Esimator" corrected to "Estimator")
    ngram = ifelse(grepl("gram", type),
                   "Ngram Estimator", "Fine Tuned\nGpt4o Estimator"),
    type = gsub(" Ngram", "", type)
  )

## The earlier filter on type != "Matched to Any Other Article" is gone:
## no row can carry that type, so it never removed anything.
ggplot(plot,
       mapping = aes(y = percent,
                     x = ngram)) +
  geom_boxplot(width = .4) +
  facet_grid(type ~ source_type,
             scales = "free") +
  theme_bw() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(y = "Percent of Articles by News Publication",
       x = NULL)
## figures/percent_matched_ngram_comparison.pdf
## 5x7